{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e432145d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.datasets import load_iris"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "92448cd5",
   "metadata": {},
   "outputs": [],
   "source": [
    "iris = load_iris()  # as_frame 默认为False 返回numpy的array 为True返回pandas的DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b373a5ac",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "sklearn.utils._bunch.Bunch"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(iris)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "efea5151",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris.keys()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7787e2fd",
   "metadata": {},
   "source": [
    "- data 数据内容\n",
    "- target 预估的目标\n",
    "- frame 当as_frame为True时的DataFrame数据\n",
    "- target_names 多分类预估数据名称\n",
    "- DESCR description 描述信息\n",
    "- feature_names 特征列名称\n",
    "- filename 数据集在本地的缓存地址\n",
    "\n",
    "### 根据data里的每行各自的特征（列名为feature_names）预测target的内容，target的内容为0，1，2，对应target_names里三种不同的花的名称（分类）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "9f6c96d5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[5.1, 3.5, 1.4, 0.2],\n",
       "       [4.9, 3. , 1.4, 0.2],\n",
       "       [4.7, 3.2, 1.3, 0.2],\n",
       "       [4.6, 3.1, 1.5, 0.2],\n",
       "       [5. , 3.6, 1.4, 0.2],\n",
       "       [5.4, 3.9, 1.7, 0.4],\n",
       "       [4.6, 3.4, 1.4, 0.3],\n",
       "       [5. , 3.4, 1.5, 0.2],\n",
       "       [4.4, 2.9, 1.4, 0.2],\n",
       "       [4.9, 3.1, 1.5, 0.1]])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris['data'][:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "ed2218bf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
       "       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
       "       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n",
       "       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris['target']  # 每一个数字代表一种花，这是真实值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "b90c8777",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['setosa', 'versicolor', 'virginica'], dtype='<U10')"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris['target_names']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "2a136ed7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['sepal length (cm)',\n",
       " 'sepal width (cm)',\n",
       " 'petal length (cm)',\n",
       " 'petal width (cm)']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris['feature_names']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "5dba598e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'iris.csv'"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris['filename']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "7fa8c0e2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'sklearn.datasets.data'"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iris['data_module']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6a91607d",
   "metadata": {},
   "source": [
    "### 拆分训练集和测试集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "5d56cee2",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "0e7b5415",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = iris['data']\n",
    "target = iris['target']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "4c7b9d74",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data, test_data, train_target, test_target = \\\n",
    "    train_test_split(data, target, test_size=0.3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "34ea9a23",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(105, 4) (105,)\n",
      "(45, 4) (45,)\n"
     ]
    }
   ],
   "source": [
    "print(train_data.shape, train_target.shape)\n",
    "print(test_data.shape, test_target.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e8fbcbe3",
   "metadata": {},
   "source": [
    "### 逻辑回归训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "4b5072c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 加载模型\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "model = LogisticRegression(max_iter=1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "40fd959c",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>#sk-container-id-1 {color: black;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>LogisticRegression(max_iter=1000)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">LogisticRegression</label><div class=\"sk-toggleable__content\"><pre>LogisticRegression(max_iter=1000)</pre></div></div></div></div></div>"
      ],
      "text/plain": [
       "LogisticRegression(max_iter=1000)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(train_data, train_target)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d792ed58",
   "metadata": {},
   "source": [
    "#### 查看模型在数据上的分数\n",
    "#可以理解为准确率"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "14bb3cce",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9904761904761905"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.score(train_data, train_target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "c197762e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9555555555555556"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.score(test_data, test_target)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "40565377",
   "metadata": {},
   "source": [
    "#### 上面的分数是根据data的预测值和target的真实进行比较得来的\n",
    "#这里通过预测也得到了一组预测值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "bbca2b5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "pred_target = model.predict(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "5ccec33c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(45,)"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pred_target.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8c2bc9c8",
   "metadata": {},
   "source": [
    "### 混淆矩阵\n",
    "#可以以此看出哪些数据错误预测到了哪个结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "46d1cf63",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 8,  0,  0],\n",
       "       [ 0, 20,  2],\n",
       "       [ 0,  0, 15]], dtype=int64)"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.metrics import confusion_matrix  # 这是一个函数，返回混淆矩阵结果\n",
    "confusion_matrix(test_target, pred_target)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "42f2679e",
   "metadata": {},
   "source": [
    "### 分类报告"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "91fe04bf",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       1.00      1.00      1.00         8\n",
      "           1       1.00      0.91      0.95        22\n",
      "           2       0.88      1.00      0.94        15\n",
      "\n",
      "    accuracy                           0.96        45\n",
      "   macro avg       0.96      0.97      0.96        45\n",
      "weighted avg       0.96      0.96      0.96        45\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import classification_report\n",
    "print(classification_report(test_target, pred_target))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "aa6ccbc4",
   "metadata": {},
   "source": [
    "针对**其中的一个结果**的预测效果，以target1为例解释\n",
    "- precision 精准率  \n",
    "    预测为1的当中实际上为1的/预测为1的  \n",
    "- recall 召回率  \n",
    "    实际上为1的当中预测为1的/实际上为1的  \n",
    "\n",
    "比如说实际为1的是一个大圈，预测为1的是当中的一个小圈，此时的精准率就是100%，因为预测为1的确实都是1  \n",
    "但是此时召回率会比较低，因为还有实际上为1的没有被预测出来  \n",
    "\n",
    "又比如说预测上为1的是一个大圈，实际上为1的是当中的一个小圈，此时召回率就是100%，因为实际上是1的都被预测为1  \n",
    "但是此时精准率会比较低，因为预测为1的当中有很多实际上不是1的（像是宁可错杀不可放过）  \n",
    "- f1-score  \n",
    "    对precision和recall的综合评价，公式：2精准率 * 召回率 / (精准率 + 召回率)  \n",
    "\n",
    "**对于总的所有结果**\n",
    "- 准确率 \n",
    "    正确预测的数量/总的数量"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "faf86820",
   "metadata": {},
   "source": [
    "### 分类列编码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "921e2cba",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "941bba90",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>x</td>\n",
       "      <td>10.00</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>y</td>\n",
       "      <td>0.00</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>z</td>\n",
       "      <td>30.12</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>d</td>\n",
       "      <td>23.10</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>e</td>\n",
       "      <td>12.00</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   A      B    C\n",
       "0  x  10.00  yes\n",
       "1  y   0.00   no\n",
       "2  z  30.12  yes\n",
       "3  d  23.10  yes\n",
       "4  e  12.00   no"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = {'A': ['x', 'y', 'z', 'd', 'e'], 'B': [10.0, 0, 30.12, 23.1, 12], 'C': ['yes', 'no', 'yes', 'yes', 'no']}\n",
    "df = pd.DataFrame(data)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "eb69560c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "7bb2409f",
   "metadata": {},
   "outputs": [],
   "source": [
    "lable_encoder = LabelEncoder()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "9723d323",
   "metadata": {},
   "outputs": [],
   "source": [
    "df['C'] = lable_encoder.fit_transform(df['C'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "92a41466",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>x</td>\n",
       "      <td>10.00</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>y</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>z</td>\n",
       "      <td>30.12</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>d</td>\n",
       "      <td>23.10</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>e</td>\n",
       "      <td>12.00</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   A      B  C\n",
       "0  x  10.00  1\n",
       "1  y   0.00  0\n",
       "2  z  30.12  1\n",
       "3  d  23.10  1\n",
       "4  e  12.00  0"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "adc8c977",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['no', 'yes'], dtype=object)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lable_encoder.classes_  # 存储的特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "0e3a3291",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['yes', 'no', 'yes', 'yes', 'no'], dtype=object)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lable_encoder.inverse_transform(df['C'])  # 反向转换"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e02dd13b",
   "metadata": {},
   "source": [
    "### one hot编码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "aab425ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "6474bc98",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>x</td>\n",
       "      <td>10.00</td>\n",
       "      <td>easy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>y</td>\n",
       "      <td>0.00</td>\n",
       "      <td>hard</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>z</td>\n",
       "      <td>30.12</td>\n",
       "      <td>hard</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>d</td>\n",
       "      <td>23.10</td>\n",
       "      <td>normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>e</td>\n",
       "      <td>12.00</td>\n",
       "      <td>easy</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   A      B       C\n",
       "0  x  10.00    easy\n",
       "1  y   0.00    hard\n",
       "2  z  30.12    hard\n",
       "3  d  23.10  normal\n",
       "4  e  12.00    easy"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = {'A': ['x', 'y', 'z', 'd', 'e'], 'B': [10.0, 0, 30.12, 23.1, 12], 'C': ['easy', 'hard', 'hard', 'normal', 'easy']}\n",
    "df = pd.DataFrame(data)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "8e08ed18",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import OneHotEncoder\n",
    "one_hot_encoder = OneHotEncoder()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "944fec78",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  (0, 0)\t1.0\n",
      "  (1, 1)\t1.0\n",
      "  (2, 1)\t1.0\n",
      "  (3, 2)\t1.0\n",
      "  (4, 0)\t1.0\n"
     ]
    }
   ],
   "source": [
    "print(one_hot_encoder.fit_transform(df[['C']]))  # sparse_output默认为True，训练后返回稀疏矩阵，只给值为1的位置"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "436cef69",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1., 0., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [0., 0., 1.],\n",
       "       [1., 0., 0.]])"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "one_hot_encoder = OneHotEncoder(sparse_output=False)  # 修改为False以显示整个矩阵\n",
    "one_hot_encoder.fit_transform(df[['C']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "034a753e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
